Importing libraries

# Load libraries ----
library(readxl)
library(dplyr)
library(plyr)
library(cluster)
library(ClusterR)
library(ggplot2)
library(tidyverse)
library(moments)
library(forcats)
library(gridExtra)
library(RColorBrewer)
library(rsample)
library(broom.mixed)
library(Metrics)
library(corrplot)
library(caret)
library(factoextra)
library(ggpubr)
library(rpart)
library(rpart.plot)
library(jtools)
library(devtools)
# ggbiplot is not on CRAN: it must be installed from GitHub BEFORE it can be
# attached with library(). The original script called install_github() after
# library(ggbiplot), which fails on a machine where the package is absent.
if (!requireNamespace("ggbiplot", quietly = TRUE)) {
  install_github("vqv/ggbiplot")
}
library(ggbiplot)

1. Please find your original dataset or datasets; and describe your data in the first step.

Our dataset is the Human Resources (HR) data of a multinational company and describes employee attributes. It contains 14,999 unique employees and 11 attributes per employee, such as:

Emp_id = Employee Id

satisfaction_level = Satisfaction of Employee from Company

last_evaluation = Score of the employee's most recent performance evaluation

number_project = Number of the projects that an employee works

average_montly_hours = Average Monthly hours of an employee

Work_accident = True/False answer to the question: “Has the employee had a work accident?”

left = True/False answer to the question: “Has the employee left the company?”

promotion_last_5years = True/False answer to the question: “Has the employee been promoted in the last 5 years?”

Department = Department of the Employee

salary = Salary of the Employee (as: low,medium,high)

# Load the HR dataset from Excel and inspect its structure (14,999 rows, 11 columns).
data <- read_excel("HR_Employee_Data.xlsx")
glimpse(data,width = getOption("width"))
## Rows: 14,999
## Columns: 11
## $ Emp_Id                <chr> "IND02438", "IND28133", "IND07164", "IND30478", …
## $ satisfaction_level    <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, …
## $ last_evaluation       <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, …
## $ number_project        <dbl> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, …
## $ average_montly_hours  <dbl> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142…
## $ time_spend_company    <dbl> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, …
## $ Work_accident         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ left                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ promotion_last_5years <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Department            <chr> "sales", "sales", "sales", "sales", "sales", "sa…
## $ salary                <chr> "low", "medium", "medium", "low", "low", "low", …

2. Use “Exploratory data analysis”. Write down your comments.

We dropped Emp_Id because it is a unique identifier with no predictive value. As transformations, we min-max scaled the numeric columns to the [0, 1] range and encoded the ordinal salary column as an integer, so that all features are numeric for the later correlation, PCA, and clustering steps.

# Confirm the dataset has no missing values.
sum(is.na(data))

# Min-max scale every numeric column to the [0, 1] range.
rangeTransform <- preProcess(data, method = "range")
data <- predict(rangeTransform, data)

# Encode the salary column (character: low/medium/high) as integer codes.
data <- dplyr::mutate(data, salary = as.integer(as.factor(salary)))

# Drop the employee identifier: unique per row, so it carries no signal.
data <- data %>%
  dplyr::select(-Emp_Id)

# Count employees per department, largest departments first.
top_salaries <- data %>%
  dplyr::count(Department, name = "count", sort = TRUE)

# Keep only the five departments with the most employees.
top_5_data <- head(top_salaries, 5)

# Inspect the transformed dataset.
glimpse(data)
## Rows: 14,999
## Columns: 10
## $ satisfaction_level    <dbl> 0.31868132, 0.78021978, 0.02197802, 0.69230769, …
## $ last_evaluation       <dbl> 0.265625, 0.781250, 0.812500, 0.796875, 0.250000…
## $ number_project        <dbl> 0.0, 0.6, 1.0, 0.6, 0.0, 0.0, 0.8, 0.6, 0.6, 0.0…
## $ average_montly_hours  <dbl> 0.2850467, 0.7757009, 0.8224299, 0.5934579, 0.29…
## $ time_spend_company    <dbl> 0.125, 0.500, 0.250, 0.375, 0.125, 0.125, 0.250,…
## $ Work_accident         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ left                  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ promotion_last_5years <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Department            <chr> "sales", "sales", "sales", "sales", "sales", "sa…
## $ salary                <int> 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …

3. Use some “visualization techniques” and talk about your data further

# Share of employees in each department, with a percentage label per slice.
employment_type_count <- data %>%
  group_by(Department) %>%
  dplyr::summarise(count = n()) %>%
  mutate(
    Percent = paste(Department, ":", round(100 * count / sum(count)), "%"),
    prop = count / sum(count) * 100
  )

# Pie chart: a single stacked bar drawn on polar coordinates.
ggplot(employment_type_count, aes(x = "", y = prop, fill = Percent)) +
  geom_bar(stat = "identity", width = 1, color = "black") +
  coord_polar("y", start = 0) +
  theme_void()

This graph contains all departments. It shows the percentages of employees by each department.

# Bar chart of the five largest departments, with counts printed inside the
# bars. The original title ("Number of different positions in data science")
# was a leftover from a different dataset; this chart shows HR departments.
ggplot(top_5_data) +
  geom_col(aes(x = reorder(Department, -count), y = count, fill = Department)) +
  geom_text(aes(x = reorder(Department, -count), y = count, label = count),
            vjust = 1.5, colour = "white",
            position = position_dodge(.9), size = 5) +
  ggtitle("Top 5 departments by number of employees") +
  xlab("") +
  ylab("") +
  theme(axis.text.x = element_text(angle = 30, size = 10,
                                   color = "black", hjust = 0.5)) +
  theme(legend.position = "none")

There are 5 departments with the highest number of employees in the company. Based on this graph, the department with the most employees is the sales department, followed by the technical department.

4. Check your data for multicollinearity.

In this correlation matrix, the darker the color, the stronger the correlation. We can say that average_montly_hours and number_project have a positive correlation, while satisfaction_level and left have a negative but not strong correlation.

# Correlation matrix over the numeric columns (all except Department, col 9).
data_cor <- data[, c(1:8, 10)]
# Name the result cor_matrix rather than `cor`, which shadowed stats::cor().
cor_matrix <- cor(data_cor)
corrplot(cor_matrix, method = 'color')

## 5. Apply PCA

5.A.Use appropriate functions and arguments

We run PCA on the numeric features with centering and unit-variance scaling, then inspect the loadings and the variance explained by each component.

# PCA on the numeric features, centered and scaled to unit variance.
datapca <- prcomp(data_cor, scale=TRUE,center = TRUE)
# Flip the sign of every loading. NOTE(review): the sign of each principal
# component is mathematically arbitrary (prcomp may return either
# orientation); this flip is purely cosmetic so the dominant loadings read
# as positive. Eigenvectors do NOT "point in the negative direction by
# default" as the original comment claimed.
datapca$rotation <- -1*datapca$rotation
# First rows of the loading (rotation) matrix.
head(datapca$rotation)
##                              PC1        PC2        PC3          PC4
## satisfaction_level   -0.18916086  0.6084103 -0.1442392  0.002025143
## last_evaluation       0.46380764  0.3118634 -0.1548850  0.017424116
## number_project        0.55705957  0.1219683 -0.0106627  0.023291069
## average_montly_hours  0.52562028  0.1779333 -0.1097887  0.034336941
## time_spend_company    0.33377147 -0.1177003  0.4409212 -0.054714112
## Work_accident        -0.06439211  0.2808987  0.4273870  0.032900192
##                               PC5         PC6         PC7         PC8
## satisfaction_level   -0.254121334  0.32261547 -0.24348487  0.29096526
## last_evaluation      -0.104336225  0.06469366 -0.52265596 -0.54805156
## number_project        0.096762382 -0.18884268  0.47347561 -0.24135584
## average_montly_hours  0.009167641 -0.25359580 -0.02333746  0.72134010
## time_spend_company   -0.042899061  0.79304840  0.16023409  0.09322204
## Work_accident         0.811963652 -0.06525810 -0.25275425  0.02940217
##                              PC9
## satisfaction_level    0.51053109
## last_evaluation      -0.27336728
## number_project        0.58880888
## average_montly_hours -0.30587487
## time_spend_company   -0.11048404
## Work_accident         0.07010533
summary(datapca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion  0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
##                            PC8     PC9
## Standard deviation     0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion  0.94696 1.00000

5.B. Use visualization techniques for PCA

# Scree plot: percentage of variance explained by each component.
fviz_eig(datapca, addlabels=TRUE, barfill = "red")

# Biplot of the first two components. `groups` expects a discrete variable,
# so the numeric 0/1 `left` column is converted to a factor; this also makes
# the per-group confidence ellipses meaningful.
ggbiplot(datapca, labels.size = 0, alpha = 0.1, ellipse = TRUE,
         groups = factor(data$left))

### 5.C. Final comments

The scree plot shows that the first two components explain only about 37% of the total variance and that the variance is spread fairly evenly across all nine components, so no small subset of components summarizes the data well. The biplot colored by `left` shows how leavers and stayers spread along PC1 and PC2.

6.Apply Logistic Regression or Regression

6.A. Use appropriate functions and arguments

We split our dataset into two tables, trainSet and testSet, with 80% and 20% of the data, respectively.

# Reproducible 80/20 train/test split, stratified on the target `left`.
set.seed(241)
train_rows <- createDataPartition(data$left, p = 0.8, list = FALSE)

trainSet <- data[train_rows, ]   # 80% of rows for model fitting
testSet <- data[-train_rows, ]   # held-out 20% for evaluation

After implementing the Generalized Linear Model, we get a summary.

# Logistic regression: model P(left = 1) as a function of all other columns.
modelLogistic <- glm(left ~ ., data = trainSet, family = binomial)
summary(modelLogistic)
## 
## Call:
## glm(formula = left ~ ., family = "binomial", data = trainSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4184  -0.6786  -0.4309  -0.1477   3.1315  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            0.572945   0.154168   3.716 0.000202 ***
## satisfaction_level    -3.731186   0.098541 -37.864  < 2e-16 ***
## last_evaluation        0.480410   0.104601   4.593 4.37e-06 ***
## number_project        -1.594227   0.117225 -13.600  < 2e-16 ***
## average_montly_hours   0.881358   0.121211   7.271 3.56e-13 ***
## time_spend_company     1.921482   0.134889  14.245  < 2e-16 ***
## Work_accident         -1.528568   0.099376 -15.382  < 2e-16 ***
## promotion_last_5years -1.687754   0.280596  -6.015 1.80e-09 ***
## Departmenthr           0.145235   0.143898   1.009 0.312833    
## DepartmentIT          -0.205707   0.132241  -1.556 0.119816    
## Departmentmanagement  -0.807246   0.171708  -4.701 2.59e-06 ***
## Departmentmarketing   -0.111705   0.142393  -0.784 0.432758    
## Departmentproduct_mng -0.299672   0.143000  -2.096 0.036117 *  
## DepartmentRandD       -0.681839   0.160028  -4.261 2.04e-05 ***
## Departmentsales       -0.052734   0.110742  -0.476 0.633942    
## Departmentsupport      0.007872   0.118154   0.067 0.946883    
## Departmenttechnical   -0.019375   0.115688  -0.167 0.866992    
## salary                -0.027806   0.039570  -0.703 0.482242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13163  on 11999  degrees of freedom
## Residual deviance: 10619  on 11982  degrees of freedom
## AIC: 10655
## 
## Number of Fisher Scoring iterations: 5

This is the output of a generalized linear model with a binomial family. The coefficients show the relationship between the different features and the target variable “left”. The p-values show the significance of each feature in relation to the target variable.

# Predicted probability of leaving for each test-set employee, then a hard
# 0/1 classification at the conventional 0.5 threshold.
prob <- predict(modelLogistic, newdata = testSet, type = "response")
pred <- as.numeric(prob > 0.5)

We then predict the probability of leaving for each test-set employee and classify an employee as having left when the predicted probability exceeds 0.5.

# Confusion matrix for the logistic model on the held-out test set; both the
# predictions and the reference are coerced to factors with matching levels.
predicted <- factor(pred, levels = c(0, 1))
observed <- factor(testSet$left, levels = c(0, 1))
matrixLog <- confusionMatrix(predicted, observed)
matrixLog
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2111  511
##          1  170  207
##                                           
##                Accuracy : 0.7729          
##                  95% CI : (0.7575, 0.7878)
##     No Information Rate : 0.7606          
##     P-Value [Acc > NIR] : 0.05851         
##                                           
##                   Kappa : 0.2553          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9255          
##             Specificity : 0.2883          
##          Pos Pred Value : 0.8051          
##          Neg Pred Value : 0.5491          
##              Prevalence : 0.7606          
##          Detection Rate : 0.7039          
##    Detection Prevalence : 0.8743          
##       Balanced Accuracy : 0.6069          
##                                           
##        'Positive' Class : 0               
## 

6.B. Use visualization techniques for Regression

plot_summs(modelLogistic)

### 6.C Final Comments

The logistic regression model is performing moderately well, with an accuracy of 0.7729 and a kappa of 0.2553. It shows good sensitivity, with a score of 0.9255, but relatively low specificity, with a score of 0.2883. The coefficient plot shows the estimated effect (with its confidence interval) of each predictor.

7. Apply at least 2 Clustering Techniques

7.A. Describe the reason you choose those 2 techniques.

From the PCA results, we observed that our dataset contains mostly similar values that lie close to each other, so we chose K-means, a distance-based method, to group our data into clusters. We also chose hierarchical clustering to see whether a tree-based grouping gives better results on a dataset with this many data points.

7.1 K-Means Algorithm Application

7.1.A Use appropriate functions and arguments

# PCA on the numeric columns (all except Department, col 9), standardized.
# NOTE(review): this recomputes the same PCA as `datapca` in section 5.
left_pc <- prcomp(data[,c(1:8,10)], center = TRUE, scale = TRUE)
summary(left_pc)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion  0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
##                            PC8     PC9
## Standard deviation     0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion  0.94696 1.00000

7.1.B Use visualization techniques

# Scree plot of the first five PCs; the red line marks eigenvalue = 1
# (Kaiser criterion) as a guide for how many components to retain.
screeplot(left_pc, type = "l", npcs = 5, main = "Screeplot of the first 5 PCs")
abline(h = 1, col="red", lty="longdash")
legend("topright", legend=c("Eigenvalue = 1"),
       col=c("red"), lty=5, cex=0.6)

# Observations projected onto the first two principal components.
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)", ylab = "PC2 (16.3%)", main = "PC1 / PC2 - plot")

# K-means with k = 2 on the numeric features. BUG FIX: the original index
# `data[, 1:8, 10]` was parsed by `[.data.frame` as j = 1:8 with drop = 10,
# silently dropping the salary column; `c(1:8, 10)` selects the intended
# feature set (all numeric columns except Department), matching the PCA.
set.seed(101)
km <- kmeans(data[, c(1:8, 10)], 2)
# Color the PC1/PC2 projection by cluster assignment.
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)", 
     ylab = "PC2 (16.3%)", 
     main = "PC1 / PC2 - plot", 
     col=km$cluster)

km$centers
##   satisfaction_level last_evaluation number_project average_montly_hours
## 1          0.6135455       0.5517880      0.3577686            0.4851325
## 2          0.5679486       0.5571902      0.3610912            0.4918627
##   time_spend_company Work_accident       left promotion_last_5years
## 1          0.1882204             1 0.07791609            0.03503919
## 2          0.1871200             0 0.26515978            0.01893998
# K-means with k = 3. BUG FIX: replaced `data[, 1:8, 10]` (which dropped the
# salary column via the drop argument) with `data[, c(1:8, 10)]`, the
# intended feature set.
set.seed(102)
km <- kmeans(data[, c(1:8, 10)], 3)
plot(left_pc$x[,1],left_pc$x[,2], xlab="PC1 (20.7%)", 
     ylab = "PC2 (16.3%)", 
     main = "PC1 / PC2 - plot", 
     col=km$cluster)

#### k-means clustering into k=3 clusters and then visualizing the results

table(km$cluster, data$left)
##    
##        0    1
##   1 2000  121
##   2    0 3450
##   3 9428    0
# Re-fit k-means with k = 3 and visualize the clusters. BUG FIX: the index
# `data[, 1:8, 10]` dropped the salary column by accident (it was parsed as
# the drop argument); `c(1:8, 10)` selects the intended columns.
km <- kmeans(data[, c(1:8, 10)], 3)
fviz_cluster(km, data = data[, c(1:8, 10)])

7.1.C Final comments

The Department column was not included when the PCA was performed to the data. The center points were established, and k was set to 3. The center points were clustered using the kmeans algorithm, which produced the blue, red, and green clusters. The graphic illustrates the visualization of these clusters.

7.1.X. Use the PCA results

We recompute the PCA on the same numeric feature set used for the clustering so that its components can be related to the cluster results.

# NOTE(review): identical to the PCA already computed in section 7.1.A;
# shown again here next to the PCA-based comments.
left_pc <- prcomp(data[,c(1:8,10)], center = TRUE, scale = TRUE)
summary(left_pc)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6     PC7
## Standard deviation     1.3642 1.2102 1.0304 0.9995 0.9775 0.91951 0.84196
## Proportion of Variance 0.2068 0.1627 0.1180 0.1110 0.1062 0.09394 0.07877
## Cumulative Proportion  0.2068 0.3695 0.4875 0.5985 0.7046 0.79858 0.87735
##                            PC8     PC9
## Standard deviation     0.79150 0.69094
## Proportion of Variance 0.06961 0.05304
## Cumulative Proportion  0.94696 1.00000

7.2. Hierarchical Algorithm Application

7.2.A. Use appropriate functions and arguments

# Hierarchical clustering (Ward linkage) on Euclidean distances computed
# over the numeric features, excluding the target `left` (col 7) and the
# Department column (col 9).
feature_cols <- c(1:6, 8, 10)
dist_matrix <- dist(data[, feature_cols], method = "euclidean")
cluster_data <- hclust(dist_matrix, method = "ward.D")

# Dendrogram, with the three-cluster cut highlighted.
plot(cluster_data)
rect.hclust(cluster_data, k = 3)

# Cut the tree into 3 clusters and visualize them in two dimensions.
sm_hc <- cutree(cluster_data, 3)
clusplot(data[, feature_cols], clus = sm_hc, lines = 0, shade = TRUE,
         color = TRUE, labels = 2, plotchar = FALSE, span = TRUE)

7.2.B. Use visualization techniques

#plot(cluster_data)
#clusplot(data[, c(1:6, 8, 10)], clus = sm_hc, lines = 0, shade = TRUE, color = TRUE, labels = 2, plotchar = FALSE, span = TRUE)

7.2.C. Final comments

The dendrogram suggests cutting the tree into three clusters (highlighted with rect.hclust), and the clusplot shows these three clusters projected onto two components.

7.B. Compare the results you have found in 7.1 and 7.2

Both methods produce a three-group structure. According to the cluster-by-left table, k-means with k = 3 separates leavers from stayers quite cleanly (one cluster contains only leavers and another only stayers), while the hierarchical cut yields a similar three-cluster partition; k-means is considerably cheaper to compute on roughly 15,000 rows.

8. Apply at least 2 Classification Techniques.

8.A. Describe the reason you choose those 2 techniques.

We’ll use the output of step 6 here, so We’re skipping 8.2 here.

8.1. Decision Tree Algorithm Application

8.1.A Use appropriate functions and arguments

# Fit a decision tree for `left` on the training set and score the test set.
# Since `left` is numeric 0/1 here, rpart fits a regression tree, so the
# predictions are fractional leaf means rather than hard 0/1 classes.
model <- rpart(formula = left ~ ., data = trainSet)
prob2 <- predict(model, newdata = testSet, type = "vector")
After creating the model, we check whether the predicted values and the reference labels have the same levels.
levels(as.factor(prob2))
## [1] "0"                  "0.014986568641312"  "0.046031746031746" 
## [4] "0.0654205607476635" "0.0764227642276423" "0.0833333333333333"
## [7] "0.918774966711052"  "0.969507427677873"  "1"
levels(as.factor(testSet$left))
## [1] "0" "1"
We observed that the levels are not the same. In order to categorize the probabilities as 1 or 0 (the same levels as in testSet), we calculate the average value of the probability levels and get 0.36 as the threshold.
pred2 <- ifelse(prob > 0.36, 1, 0)
If the predicted value is higher than 0.36, it becomes 1; otherwise it becomes 0.
# Confusion matrix for the decision-tree predictions on the test set, with
# predictions and reference coerced to factors sharing the levels 0 and 1.
tree_pred <- factor(pred2, levels = c(0, 1))
tree_obs <- factor(testSet$left, levels = c(0, 1))
matrixLog2 <- confusionMatrix(tree_pred, tree_obs)

matrixLog2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1949  251
##          1  332  467
##                                          
##                Accuracy : 0.8056         
##                  95% CI : (0.791, 0.8196)
##     No Information Rate : 0.7606         
##     P-Value [Acc > NIR] : 1.956e-09      
##                                          
##                   Kappa : 0.4861         
##                                          
##  Mcnemar's Test P-Value : 0.0009221      
##                                          
##             Sensitivity : 0.8544         
##             Specificity : 0.6504         
##          Pos Pred Value : 0.8859         
##          Neg Pred Value : 0.5845         
##              Prevalence : 0.7606         
##          Detection Rate : 0.6499         
##    Detection Prevalence : 0.7336         
##       Balanced Accuracy : 0.7524         
##                                          
##        'Positive' Class : 0              
## 

8.1.B Use visualization techniques

rpart.plot(model, box.palette="BuGn", shadow.col="gray", nn=TRUE)

8.1.C Final comments

This conclusion shows a good accuracy score of 80.56%, indicating that the decision tree model was quite successful in predicting the correct classes. The Kappa score of 0.4861 is also good and shows the model is not just predicting the majority class.

8.B. Compare the results you have found in 8.1 and 8.2

The Decision Tree model appears to be more accurate overall, achieving an accuracy of 0.8056 compared to 0.7729 for the Logistic Regression model. The Decision Tree model also has higher specificity than the Logistic Regression model. However, the Logistic Regression model has higher sensitivity, meaning that it is better at identifying cases of the positive class (employees who stayed, coded 0 here).